#Building a join of tax data and census data
#Currently this is where we break to using percentages instead of counts.


#+++++++++++++++++++++++++++++++++++++++++++++++++++++
# * Set Directories -------------------------------------
dir = list(home   = "//jctstat16/public/demographic_tax_data/",
           data   = "R code/data/",
           raw    = "data/raw/",
           output = "R code/output/",
           code   = "R code/code/")

dir$hdata = paste0(dir$home,dir$data)
dir$hout  = paste0(dir$home,dir$output)
dir$hraw  = paste0(dir$home,dir$raw)
dir$hcode = paste0(dir$home,dir$code)
dir$boot_data = paste0(dir$hdata,"boot_samples/")
dir$boot_temp = paste0(dir$hdata,"boot_temp/")

#+++++++++++++++++++++++++++++++++++++++++++++++++++++
# * Load Packages -------------------------------------

library(tidyverse)
library(magrittr)
library(readxl)
library(broom)
library(backports) #only needed for fixest?
library(gridExtra) #needed for combining diff ggplot plots.
options("scipen"=100, "digits"=10)

#+++++++++++++++++++++++++++++++++++++++++++++++++++++
# * Grab data -------------------------------------

census_surnames = read_rds(paste0(dir$hdata,"census_surname.RDS"))
census_zips  = read_rds(paste0(dir$hdata,"census_zip.RDS"))
harvard_fn_race <- read.csv("Z:/Race_Prediction/Data/Harvard_name_data/first_raceNameProbs.csv")

#Load LIHTC validation data (names, zip codes, true race)
load("Z:/Race_Prediction/Data/lihtc_validation/lihtc_17_clean.RData")
lihtc <- lihtc_final

#+++++++++++++++++++++++++++++++++++++++++++++++++++++
# * Clean up Zips -------------------------------------
#
prior = 0.5 #jeffreys non-informative prior for a multinomial is a
#dirichlet((0.5,...,0.5)), which this mimics the posterior of.

census_zips %<>%
	mutate(across(starts_with("rg"), ~.x+prior),
		   pop = pop+6*prior) %>%
	filter(!is.na(zip))
national = census_zips %>% summarize(across(-zip,sum))
census_zips %<>% add_row(national,.before=1)
rm(national)
census_zips %<>%
	mutate(across(starts_with("rg"),
		   list(prob_gr = ~ .x/.x[1],
		  	    prob_ = ~ .x/pop), #turns into prob_rg
		   .names = "{.fn}{.col}")) %>%
	rename_with(~paste0("prob_gr",str_remove_all(.x,"prob_grrg")),
				starts_with("prob_grrg")) %>%
	select(zip,starts_with("prob_rg"),starts_with("prob_gr"))


#+++++++++++++++++++++++++++++++++++++++++++++++++++++
# * Clean up Surnames -------------------------------------
#

census_surnames %<>%
	mutate(across(starts_with("cs"),~ .x+prior),
		   sur_count = sur_count + prior*6,
		   across(starts_with("cs"), ~ .x/sur_count)) %>%
	rename_with(~paste0("prob_s",str_remove(.x,"cs")),starts_with("cs")) %>%
	mutate(surname = na_if(surname,"")) %>%
	select(-sur_rank,-sur_count)


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# * Add prior to P[FN|R] calculations-----------------------------------
#

#Note: There are not "native" or "multiracial" categories
harvard_fn_race %<>%
  mutate(firstname = tolower(name),
         f_1 = whi,
         f_2 = bla,
         f_3 = asi,
         f_4 = oth/2,    
         f_5 = oth/2,   
         f_6 = his) %>%
  select(firstname,starts_with("f_"))

#The total populations are given in a table
totalpop <- 34300000
pop1 <- totalpop*0.628 
pop2 <- totalpop*0.218
pop3 <- totalpop*0.016
pop4 <- totalpop*0.029
pop5 <- totalpop*0.030
pop6 <- totalpop*0.079

#calculate populations, adding prior
harvard_fn_race %<>%
  mutate(p1 = f_1*pop1 + prior,
         p2 = f_2*pop2 + prior,
         p3 = f_3*pop3 + prior,
         p4 = f_4*pop4 + prior,
         p5 = f_5*pop5 + prior,
         p6 = f_6*pop6 + prior,
         fname_count = p1+p2+p3+p4+p5+p6)

pop1_prior <- sum(harvard_fn_race$p1)
pop2_prior <- sum(harvard_fn_race$p2)
pop3_prior <- sum(harvard_fn_race$p3)
pop4_prior <- sum(harvard_fn_race$p4)
pop5_prior <- sum(harvard_fn_race$p5)
pop6_prior <- sum(harvard_fn_race$p6)
totpop_prior <- sum(harvard_fn_race$fname_count)
totpop_prior-pop1_prior-pop2_prior-pop3_prior-pop4_prior-pop5_prior-pop6_prior

harvard_fnames <- harvard_fn_race

#Calculate probabilities using prior
harvard_fnames %<>%
  mutate(prob_f1 = p1/pop1_prior,
         prob_f2 = p2/pop2_prior,
         prob_f3 = p3/pop3_prior,
         prob_f4 = p4/pop4_prior,
         prob_f5 = p5/pop5_prior,
         prob_f6 = p6/pop6_prior) %>%
  select(firstname,starts_with("prob_f"))

#Eliminate single letter predictions
harvard_fnames <- subset(harvard_fnames,nchar(harvard_fnames$firstname)>1)


#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# * Join LIHTC Data on Zip code ----------------------------------------
#
#Turn census zips into numeric
census_zips$zip <- as.numeric(census_zips$zip)
lihtc$zip <- as.numeric(as.character(lihtc$zip))

lihtc_join <- lihtc

#For zips that don't match, assign national average.
lihtc_join$zip <- ifelse((lihtc_join$zip %in% census_zips$zip)==FALSE,0,lihtc_join$zip)
census_zips$zip <- ifelse(is.na(census_zips$zip)==TRUE,0,census_zips$zip)

#Merge
lihtc_join <- merge(lihtc_join,census_zips,by="zip",all.x=TRUE,all.y=FALSE)
	

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# * Join LIHTC Data on Census Names-------------------------------------

lihtc_join$surname <- lihtc_join$l1
lihtc_join$l1_w1 <- sapply(strsplit(lihtc_join$l1, " "), "[", 1)
lihtc_join$l1_w2 <- sapply(strsplit(lihtc_join$l1, " "), "[", 2)

#If there are two last names, try to match on the second last name, then match on the first
lihtc_join$surname <- ifelse((lihtc_join$surname %in% census_surnames$surname)==FALSE &
                               is.na(lihtc_join$l1_w2)==FALSE,
                             lihtc_join$l1_w2,lihtc_join$surname)
lihtc_join$surname <- ifelse((lihtc_join$surname %in% census_surnames$surname)==FALSE &
                               is.na(lihtc_join$l1_w1)==FALSE,
                             lihtc_join$l1_w1,lihtc_join$surname)

census_surnames$surname <- ifelse(is.na(census_surnames$surname)==TRUE,"all other names",census_surnames$surname)

#Merge
lihtc_join <- merge(lihtc_join,census_surnames,by="surname",all.x=TRUE,all.y=FALSE)

#There are a few people with l1="NA" and we will just drop them for now
lihtc_bisg <- subset(lihtc_join,is.na(lihtc_join$l1)==FALSE)

#The rest get the "NA_surname" statistic
sn1_aon <- subset(census_surnames$prob_s1,census_surnames$surname=="all other names")
sn2_aon <- subset(census_surnames$prob_s2,census_surnames$surname=="all other names")
sn3_aon <- subset(census_surnames$prob_s3,census_surnames$surname=="all other names")
sn4_aon <- subset(census_surnames$prob_s4,census_surnames$surname=="all other names")
sn5_aon <- subset(census_surnames$prob_s5,census_surnames$surname=="all other names")
sn6_aon <- subset(census_surnames$prob_s6,census_surnames$surname=="all other names")

lihtc_bisg %<>%
  mutate(prob_s1 = ifelse(is.na(prob_s1)==TRUE,sn1_aon,prob_s1),
         prob_s2 = ifelse(is.na(prob_s2)==TRUE,sn2_aon,prob_s2),
         prob_s3 = ifelse(is.na(prob_s3)==TRUE,sn3_aon,prob_s3),
         prob_s4 = ifelse(is.na(prob_s4)==TRUE,sn4_aon,prob_s4),
         prob_s5 = ifelse(is.na(prob_s5)==TRUE,sn5_aon,prob_s5),
         prob_s6 = ifelse(is.na(prob_s6)==TRUE,sn6_aon,prob_s6))

#+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
# * Join LIHTC Data on Harvard First Names-------------------------------------

lihtc_bisg$firstname <- lihtc_bisg$f1

#Merge with first name probabilities
lihtc_bisg <- merge(lihtc_bisg,harvard_fnames,by="firstname",all.x=TRUE,all.y=FALSE)

#Replace NA values of first name with "all other names" data
fn1_aon <- subset(harvard_fnames$prob_f1,harvard_fnames$firstname=="all other names")
fn2_aon <- subset(harvard_fnames$prob_f2,harvard_fnames$firstname=="all other names")
fn3_aon <- subset(harvard_fnames$prob_f3,harvard_fnames$firstname=="all other names")
fn4_aon <- subset(harvard_fnames$prob_f4,harvard_fnames$firstname=="all other names")
fn5_aon <- subset(harvard_fnames$prob_f5,harvard_fnames$firstname=="all other names")
fn6_aon <- subset(harvard_fnames$prob_f6,harvard_fnames$firstname=="all other names")

lihtc_bisg %<>%
  mutate(prob_f1 = ifelse(is.na(prob_f1)==TRUE,fn1_aon,prob_f1),
         prob_f2 = ifelse(is.na(prob_f2)==TRUE,fn2_aon,prob_f2),
         prob_f3 = ifelse(is.na(prob_f3)==TRUE,fn3_aon,prob_f3),
         prob_f4 = ifelse(is.na(prob_f4)==TRUE,fn4_aon,prob_f4),
         prob_f5 = ifelse(is.na(prob_f5)==TRUE,fn5_aon,prob_f5),
         prob_f6 = ifelse(is.na(prob_f6)==TRUE,fn6_aon,prob_f6))

save <- lihtc_bisg

#+++++++++++++++++++++++++++++++++++++++++++++++++++++
# * Generate BIFSG -------------------------------------

#Calculate BIFSG
lihtc_bisg %<>%
	mutate(prob_b1 = prob_s1*prob_f1*prob_gr1,
		   prob_b2 = prob_s2*prob_f2*prob_gr2,
		   prob_b3 = prob_s3*prob_f3*prob_gr3,
		   prob_b4 = prob_s4*prob_f4*prob_gr4,
		   prob_b5 = prob_s5*prob_f5*prob_gr5,
		   prob_b6 = prob_s6*prob_f6*prob_gr6,
		   denom = prob_b1+prob_b2+prob_b3+
		   	prob_b4+prob_b5+prob_b6,
		   across(starts_with("prob_b"),
		   	   ~.x/denom)) %>%
	select(-denom,-starts_with("prob_s"),-starts_with("prob_f"),
	       -starts_with("l1_"))



#+++++++++++++++++++++++++++++++++++++++++++++++++++++
# * Spouse Surname Probs -----------------------------
#
# Again, join with census surnames

lihtc_bisg$sp_surname <- lihtc_bisg$l2

lihtc_bisg$l2_w1 <- sapply(strsplit(lihtc_bisg$l2, " "), "[", 1)
lihtc_bisg$l2_w2 <- sapply(strsplit(lihtc_bisg$l2, " "), "[", 2)

#Change name for the merge
colnames(census_surnames)[1] <- "sp_surname"

lihtc_bisg$sp_surname <- ifelse((lihtc_bisg$sp_surname %in% census_surnames$sp_surname)==FALSE &
                               is.na(lihtc_bisg$l2_w2)==FALSE,
                             lihtc_bisg$l2_w2,lihtc_bisg$sp_surname)
lihtc_bisg$sp_surname <- ifelse((lihtc_bisg$sp_surname %in% census_surnames$sp_surname)==FALSE &
                               is.na(lihtc_bisg$l2_w1)==FALSE,
                             lihtc_bisg$l2_w1,lihtc_bisg$sp_surname)

lihtc_bisg$sp_surname <- ifelse(is.na(lihtc_bisg$spouse_ssn)==TRUE,NA,
                                lihtc_bisg$sp_surname)

#Merge
lihtc_bisg <- merge(lihtc_bisg,census_surnames,by="sp_surname",all.x=TRUE,all.y=FALSE)

lihtc_bisg %<>%
  mutate(prob_s1 = ifelse(is.na(prob_s1)==TRUE &
                            is.na(spouse_ssn)==FALSE,sn1_aon,prob_s1),
         prob_s2 = ifelse(is.na(prob_s2)==TRUE &
                            is.na(spouse_ssn)==FALSE,sn2_aon,prob_s2),
         prob_s3 = ifelse(is.na(prob_s3)==TRUE &
                            is.na(spouse_ssn)==FALSE,sn3_aon,prob_s3),
         prob_s4 = ifelse(is.na(prob_s4)==TRUE &
                            is.na(spouse_ssn)==FALSE,sn4_aon,prob_s4),
         prob_s5 = ifelse(is.na(prob_s5)==TRUE &
                            is.na(spouse_ssn)==FALSE,sn5_aon,prob_s5),
         prob_s6 = ifelse(is.na(prob_s6)==TRUE &
                            is.na(spouse_ssn)==FALSE,sn6_aon,prob_s6))
#+++++++++++++++++++++++++++++++++++++++++++++++++++++
# * Spouse Firstname Probs ---------------------------

lihtc_bisg$sp_firstname <- lihtc_bisg$f2
lihtc_bisg$sp_firstname <- ifelse(is.na(lihtc_bisg$spouse_ssn)==TRUE,NA,
                                lihtc_bisg$sp_firstname)


#Merge with first name probabilities
colnames(harvard_fnames)[1] <- "sp_firstname"
lihtc_bisg <- merge(lihtc_bisg,harvard_fnames,by="sp_firstname",all.x=TRUE,all.y=FALSE)

#Replace NA values of first name with "all other names" data
fn1_aon <- subset(harvard_fnames$prob_f1,harvard_fnames$sp_firstname=="all other names")
fn2_aon <- subset(harvard_fnames$prob_f2,harvard_fnames$sp_firstname=="all other names")
fn3_aon <- subset(harvard_fnames$prob_f3,harvard_fnames$sp_firstname=="all other names")
fn4_aon <- subset(harvard_fnames$prob_f4,harvard_fnames$sp_firstname=="all other names")
fn5_aon <- subset(harvard_fnames$prob_f5,harvard_fnames$sp_firstname=="all other names")
fn6_aon <- subset(harvard_fnames$prob_f6,harvard_fnames$sp_firstname=="all other names")

lihtc_bisg %<>%
  mutate(prob_f1 = ifelse(is.na(prob_f1)==TRUE,fn1_aon,prob_f1),
         prob_f2 = ifelse(is.na(prob_f2)==TRUE,fn2_aon,prob_f2),
         prob_f3 = ifelse(is.na(prob_f3)==TRUE,fn3_aon,prob_f3),
         prob_f4 = ifelse(is.na(prob_f4)==TRUE,fn4_aon,prob_f4),
         prob_f5 = ifelse(is.na(prob_f5)==TRUE,fn5_aon,prob_f5),
         prob_f6 = ifelse(is.na(prob_f6)==TRUE,fn6_aon,prob_f6))

#+++++++++++++++++++++++++++++++++++++++++++++++++++++
# * Generate Spouse BISG -------------------------------------
#

#Calculate BIFSG
lihtc_bisg %<>%
  mutate(sp_prob_b1 = ifelse(is.na(spouse_ssn)==TRUE,NA,
                             prob_s1*prob_f1*prob_gr1),
         sp_prob_b2 = ifelse(is.na(spouse_ssn)==TRUE,NA,
                             prob_s2*prob_f2*prob_gr2),
         sp_prob_b3 = ifelse(is.na(spouse_ssn)==TRUE,NA,
                             prob_s3*prob_f3*prob_gr3),
         sp_prob_b4 = ifelse(is.na(spouse_ssn)==TRUE,NA,
                             prob_s4*prob_f4*prob_gr4),
         sp_prob_b5 = ifelse(is.na(spouse_ssn)==TRUE,NA,
                             prob_s5*prob_f5*prob_gr5),
         sp_prob_b6 = ifelse(is.na(spouse_ssn)==TRUE,NA,
                             prob_s6*prob_f6*prob_gr6),
         denom = sp_prob_b1+sp_prob_b2+sp_prob_b3+
           sp_prob_b4+sp_prob_b5+sp_prob_b6,
         across(starts_with("sp_prob_b"),
                ~.x/denom)) %>%
  select(-denom,-starts_with("prob_s"),-starts_with("prob_f"),
         -starts_with("prob_gr"),-starts_with("prob_rg"),
         -starts_with("l2_"))

lihtc_bisg <- lihtc_bisg[order(lihtc_bisg$filer_ssn),]

#Save
save(lihtc_bisg, file = "Z:/Race_Prediction/Data/NTJ short paper/lihtc_17_biFsg.RData")



